package com.alibaba.spider.util;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;

/**
 * Static helpers that download a web page with jsoup and extract a value from it using an
 * XPath expression.
 *
 * @author fangyu
 * @version v1.0.0
 * @date 2020/10/4 5:23 PM
 */
public class JsoupHelper {
    /**
     * Downloads {@code url} with an HTTP GET and evaluates {@code xpath} against the page body.
     *
     * <p>Only the inner HTML of the {@code <body>} element is fetched; HtmlCleaner re-wraps it
     * in an {@code html}/{@code head}/{@code body} skeleton before it is converted to a W3C DOM,
     * so expressions such as {@code //div} or {@code /html/body/div} can be used.
     *
     * <p>The (misspelled) method name is kept as-is so existing callers keep compiling.
     *
     * @param url   address of the page to download
     * @param xpath XPath 1.0 expression to evaluate against the page
     * @return the result of {@link XPathExpression#evaluate(Object)}, i.e. the string value of
     *         the expression, or {@code null} if the page could not be downloaded
     * @throws Exception if the DOM cannot be built or the XPath expression cannot be compiled
     *                   or evaluated
     */
    public static Object fecthNode(String url, String xpath) throws Exception {
        String html;
        try {
            Connection connect = Jsoup.connect(url);
            html = connect.get().body().html();
        } catch (IOException e) {
            // Best effort: a failed download is reported and signalled to the caller as null.
            e.printStackTrace();
            return null;
        }

        // Real-world HTML is rarely well-formed XML, so a plain DocumentBuilder would reject it
        // (and DocumentBuilder.parse(String) expects a URI, not markup). Let HtmlCleaner repair
        // the markup and serialize it into a W3C DOM instead.
        HtmlCleaner cleaner = new HtmlCleaner();
        // Keep element names un-namespaced so plain XPath steps like //div match.
        cleaner.getProperties().setNamespacesAware(false);
        TagNode root = cleaner.clean(html);
        Document document =
                new org.htmlcleaner.DomSerializer(cleaner.getProperties()).createDOM(root);

        XPath xPath = XPathFactory.newInstance().newXPath();
        XPathExpression expression = xPath.compile(xpath);
        // Evaluate against the parsed document, not against the raw HTML string.
        return expression.evaluate(document);
    }

    /**
     * Downloads {@code url} and returns the first match of {@code xpath}, evaluated with
     * HtmlCleaner's built-in XPath support.
     *
     * <p>NOTE(review): the page is requested with HTTP POST here, whereas
     * {@link #fecthNode(String, String)} uses GET — confirm that POST is intended.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression understood by {@link TagNode#evaluateXPath(String)}
     * @return {@code toString()} of the first match, or {@code null} when nothing matches or
     *         when the download or the XPath evaluation fails (the failure is printed to
     *         standard error)
     */
    public static String getContent(String url, String xpath) {
        try {
            String contents = Jsoup.connect(url).post().html();
            HtmlCleaner hc = new HtmlCleaner();
            TagNode tn = hc.clean(contents);
            Object[] list = tn.evaluateXPath(xpath);
            if (list.length > 0) {
                return list[0].toString();
            }
        } catch (IOException | org.htmlcleaner.XPatherException e) {
            // Best effort: report the problem and fall through to the null result.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Manual smoke test: prints the {@code content} attribute selected by
     * {@code //meta[2]/@content} on a sample product page.
     *
     * @param args unused
     * @throws Exception declared for convenience; {@link #getContent(String, String)} itself
     *                   handles its checked exceptions
     */
    public static void main(String[] args) throws Exception {
        System.out.println(getContent("https://www.alibaba.com/product-detail/OEM-Customized-2019-Printed-Tropical-Floral_62084397812.html?spm=a2700.galleryofferlist.normalList.1.22b16ff2lxR4py&s=p", "//meta[2]/@content"));
    }
}
